STATS 506 HW 5-kjaewon

Github Repository: https://github.com/kjaewon-umich/STATS506

Problem 1.

Setup

library(Rcpp)

a.

#' Rational Number Class
#'
#' An S4 class to represent a rational number with a numerator and denominator.
#'
#' @slot numerator An integer representing the numerator of the rational number.
#' @slot denominator An integer representing the denominator of the rational number.
#' @prototype A rational number with numerator `1L` and denominator `1L`.
#' @validity Ensures that the denominator is not zero.
#'
#' @examples
#' r <- rational(3, 4)
#' show(r)
setClass(
  "rational",
  slots = c(numerator = "integer", denominator = "integer"),
  prototype = list(numerator = 1L, denominator = 1L),
  validity = function(object) {
    if (object@denominator == 0L) {
      return("Denominator cannot be zero.")
    }
    TRUE
  }
)

#' Rational Number Constructor
#'
#' Creates an instance of the `rational` class.
#'
#' @param numerator An integer for the numerator. Defaults to `1L`.
#' @param denominator An integer for the denominator. Defaults to `1L`.
#'
#' @return A `rational` object.
#' @examples
#' r <- rational(3, 4)
#' show(r)
rational <- function(numerator = 1L, denominator = 1L) {
  new("rational", numerator = as.integer(numerator), 
      denominator = as.integer(denominator))
}

#' Compute GCD and LCM
#'
#' Functions to compute the greatest common divisor (GCD) and least common 
#' multiple (LCM).
#'
#' @param a An integer.
#' @param b An integer.
#'
#' @return The GCD or LCM of `a` and `b`.
#' @examples
#' gcd(12, 15)  # Output: 3
#' lcm(12, 15)  # Output: 60
cppFunction('
int gcd(int a, int b) {
  while (b != 0) {
    int temp = b;
    b = a % b;
    a = temp;
  }
  return std::abs(a);
}

int lcm(int a, int b) {
  return std::abs(a * b) / gcd(a, b);  // LCM formula using GCD
}
')


#' Simplify a Rational Number
#'
#' A method to simplify a `rational` object by reducing the numerator and 
#' denominator by their GCD.
#'
#' @param x A `rational` object.
#'
#' @return A simplified `rational` object.
#' @examples
#' r <- rational(6, 8)
#' simplify(r)  # Output: 3/4
setGeneric("simplify", function(x) standardGeneric("simplify"))
[1] "simplify"
# Define the simplify method for rational objects
setMethod("simplify", "rational", function(x) {
  divisor <- gcd(x@numerator, x@denominator)
  x@numerator <- as.integer(x@numerator / divisor)
  x@denominator <- as.integer(x@denominator / divisor)
  
  # Ensure the denominator is positive
  if (x@denominator < 0) {
    x@numerator <- as.integer(-x@numerator)
    x@denominator <- as.integer(-x@denominator)
  }
  x
})

#' Display a Rational Number
#'
#' A method to print a `rational` object in the format `numerator/denominator`.
#'
#' @param object A `rational` object.
#'
#' @examples
#' r <- rational(5, 10)
#' show(r)  # Output: 1/2
setMethod("show", "rational", function(object) {
  object <- simplify(object)
  cat(object@numerator, "/", object@denominator, "\n")
})

#' Compute Quotient of a Rational Number
#'
#' A method to compute the floating-point value of a `rational` object.
#'
#' @param x A `rational` object.
#' @param digits An optional integer specifying the number of decimal places to 
#' round to.
#'
#' @return A numeric value representing the quotient.
#' @examples
#' r <- rational(1, 3)
#' quotient(r)         # Output: 0.3333...
#' quotient(r, digits = 2)  # Output: 0.33
setGeneric("quotient", function(x, digits = NULL) standardGeneric("quotient"))
[1] "quotient"
# Update the quotient method for rational objects with rounding
setMethod("quotient", "rational", function(x, digits = NULL) {
  result <- x@numerator / x@denominator
  if (!is.null(digits)) {
    result <- round(result, digits)
  }
  result
})

#' Arithmetic Operations for Rational Numbers
#'
#' Methods for addition, subtraction, multiplication, and division of `rational` 
#' objects.
#'
#' @param e1 A `rational` object.
#' @param e2 A `rational` object.
#'
#' @return A new `rational` object representing the result.
#' @examples
#' r1 <- rational(1, 3)
#' r2 <- rational(1, 6)
#' r1 + r2  # Output: 1/2
#' r1 - r2  # Output: 1/6
#' r1 * r2  # Output: 1/18
#' r1 / r2  # Output: 2
setMethod("+", c("rational", "rational"), function(e1, e2) {
  common_denominator <- e1@denominator * e2@denominator
  new_numerator <- e1@numerator * e2@denominator + e2@numerator * e1@denominator
  simplify(rational(new_numerator, common_denominator))
})

# Define subtraction for two rational objects
setMethod("-", c("rational", "rational"), function(e1, e2) {
  common_denominator <- e1@denominator * e2@denominator
  new_numerator <- e1@numerator * e2@denominator - e2@numerator * e1@denominator
  simplify(rational(new_numerator, common_denominator))
})

setMethod("*", c("rational", "rational"), function(e1, e2) {
  new_numerator <- e1@numerator * e2@numerator
  new_denominator <- e1@denominator * e2@denominator
  simplify(rational(new_numerator, new_denominator))
})

# Define division for two rational objects
setMethod("/", c("rational", "rational"), function(e1, e2) {
  new_numerator <- e1@numerator * e2@denominator
  new_denominator <- e1@denominator * e2@numerator
  simplify(rational(new_numerator, new_denominator))
})

b.

# Testing the class with given test cases
r1 <- rational(24, 6)
r2 <- rational(7, 230)
r3 <- rational(0, 4)
r1
4 / 1 
r3
0 / 1 
r1 + r2
927 / 230 
r1 - r2
913 / 230 
r1 * r2
14 / 115 
r1 / r2
920 / 7 
r1 + r3
4 / 1 
r1 * r3
0 / 1 
r2 / r3
Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'simplify': invalid class "rational" object: Denominator cannot be zero.
quotient(r1)
[1] 4
quotient(r2)
[1] 0.03043478
quotient(r2, digits = 3)
[1] 0.03
quotient(r2, digits = 3.14)
[1] 0.03
quotient(r2, digits = "avocado")
Error in round(result, digits): non-numeric argument to mathematical function
q2 <- quotient(r2, digits = 3)
q2
[1] 0.03
quotient(r3)
[1] 0
simplify(r1)
4 / 1 
simplify(r2)
7 / 230 
simplify(r3)
0 / 1 

c.

#' Rational Number Class
#'
#' An S4 class to represent a rational number with enhanced validation for 
#' the numerator and denominator.
#'
#' @slot numerator An integer representing the numerator of the rational number.
#' @slot denominator An integer representing the denominator of the rational number.
#' @prototype A rational number with numerator `1L` and denominator `1L`.
#' @validity Ensures that:
#'   - The denominator is not zero.
#'   - Neither the numerator nor the denominator is `NA`.
#'   - Both numerator and denominator are integers.
#'
#' @examples
#' # Create a valid rational number
#' r <- rational(3, 4)
#' r
#'
#' # Attempt to create invalid rational numbers
#' try(rational(3, 0))   # Denominator cannot be zero
#' try(rational(NA, 5))  # Numerator cannot be NA
#' try(rational(5.5, 2)) # Numerator must be an integer
#'
setClass(
  "rational",
  slots = c(numerator = "integer", denominator = "integer"),
  prototype = list(numerator = 1L, denominator = 1L),
  validity = function(object) {
    if (object@denominator == 0L) {
      return("Denominator cannot be zero.")
    }
    if (is.na(object@numerator) || is.na(object@denominator)) {
      return("Numerator or denominator cannot be NA.")
    }
    if (!is.integer(object@numerator) || !is.integer(object@denominator)) {
      return("Numerator and denominator must be integers.")
    }
    TRUE
  }
)

#' Rational Number Class
#'
#' An S4 class to represent a rational number with enhanced validation for 
#' the numerator and denominator.
#'
#' @slot numerator An integer representing the numerator of the rational number.
#' @slot denominator An integer representing the denominator of the rational number.
#' @prototype A rational number with numerator `1L` and denominator `1L`.
#' @validity Ensures that:
#'   - The denominator is not zero.
#'   - Neither the numerator nor the denominator is `NA`.
#'   - Both numerator and denominator are integers.
#'
#' @examples
#' # Create a valid rational number
#' r <- rational(3, 4)
#' r
#'
#' # Attempt to create invalid rational numbers
#' try(rational(3, 0))   # Denominator cannot be zero
#' try(rational(NA, 5))  # Numerator cannot be NA
#' try(rational(5.5, 2)) # Numerator must be an integer
#'
rational <- function(numerator = 1L, denominator = 1L) {
  if (!is.integer(numerator)) numerator <- as.integer(numerator)
  if (!is.integer(denominator)) denominator <- as.integer(denominator)
  new("rational", numerator = numerator, denominator = denominator)
}

# Test Cases
try(rational(1, 0))         # Invalid: denominator is zero
Error in validObject(.Object) : 
  invalid class "rational" object: Denominator cannot be zero.
try(rational(1, NA))        # Invalid: denominator is NA
Error in if (object@denominator == 0L) { : 
  missing value where TRUE/FALSE needed
try(rational(NA, 1))        # Invalid: numerator is NA
Error in validObject(.Object) : 
  invalid class "rational" object: Numerator or denominator cannot be NA.
try(rational("1", 2))       # Invalid: numerator is a string
1 / 2 
try(rational(1.5, 2))       # Invalid: numerator is a float
1 / 2 
# Valid Input
r4 <- rational(3, 4)
r4
3 / 4 

Problem 2.

Setup

library(tidyverse)
library(plotly)
art <- read.csv("df_for_ml_improved_new_market.csv", header = TRUE)

a.

From Problem Set 4, we observed that the distribution of art sales genres appears to change across years. To better visualize this trend, we preprocess the data using tidyverse to calculate the proportion of each genre for every year.

# Data Preparation
art$Genre___Others[art$Genre___Painting == 1] <- 0
art$genre <- "Photography"  # Default genre
art$genre[art$Genre___Print == 1] <- "Print"
art$genre[art$Genre___Sculpture == 1] <- "Sculpture"
art$genre[art$Genre___Painting == 1] <- "Painting"
art$genre[art$Genre___Others == 1] <- "Other"

# Create proportions by year and genre
yeargenre <- with(art, table(year, genre))
ygperc <- yeargenre / rowSums(yeargenre)
ygpercm <- as.data.frame(as.table(ygperc))
colnames(ygpercm) <- c("year", "genre", "proportion")

# Ensure genres are factors with the correct order
ygpercm$genre <- factor(ygpercm$genre, 
                        levels = c("Painting", "Sculpture","Photography", 
                                   "Print", "Other"))

To address this question, we utilize the interactive features of the plotly package. The interactive plot highlights the genre name and its proportion of the total sales volume for each year, allowing users to explore the changes in the distribution dynamically.

# Create Plotly Horizontal Stacked Bar Plot
fig <- ygpercm %>%
  plot_ly(
    x = ~proportion,             # Proportion on the x-axis
    y = ~year,                   # Year on the y-axis
    color = ~genre,              # Color by genre
    type = "bar",                # Bar plot type
    orientation = "h",           # Horizontal orientation
    text = ~paste(
      "Genre:", genre, "<br>",
      "Proportion:", round(proportion, 2)
    ),                           # Custom tooltip
    hoverinfo = "text"           # Display custom tooltip
  ) %>%
  layout(
    barmode = "stack",           # Stack the bars
    title = "Proportion of Genre of Art Sales",
    xaxis = list(
      title = "Proportion",
      range = c(0, 1)            # Ensure proportions are bounded between 0 and 1
    ),
    yaxis = list(title = "Year"),
    legend = list(title = list(text = "Genre"))
  )

# Add arrow and text for "Other"
fig <- fig %>%
  add_annotations(
    x = 1.02, y = 15,            # Position of "Other" annotation
    text = "Other",
    showarrow = TRUE,
    ax = -20, ay = 0,            # Adjust arrow length and angle
    arrowhead = 2
  )

# Display the plot
fig

From the visualization, we observe the following trends:

  • Painting shows the most significant decline over time.
  • Sculpture, Photography, and Other genres remain relatively stable.
  • Print exhibits a noticeable increase in its proportion starting around 2000.

b.

From Problem Set 4, we observed a change in sales prices over time, with genre significantly influencing these trends. To better visualize this, we preprocess the data using tidyverse, calculating the median and 97.5th percentile prices by year and genre.

# Data Preparation
artmedian <- aggregate(art$price_usd, by = list(art$year, art$genre), 
                       FUN = median, na.rm = TRUE)
names(artmedian) <- c("year", "genre", "price_usd")

art975 <- aggregate(art$price_usd, by = list(art$year, art$genre), 
                    FUN = quantile, probs = 0.975, na.rm = TRUE)
names(art975) <- c("year", "genre", "price_usd")

# Combine median and 97.5th percentile
artcombine <- bind_rows(
  artmedian %>% mutate(measure = "Median"),
  art975 %>% mutate(measure = "97.5%")
)

To leverage plotly package’s interactive features effectively, we highlight the price, genre, and year to provide detailed insights for each data point.

# Create Interactive Plotly Plot
fig2 <- artcombine %>%
  plot_ly(
    x = ~year,
    y = ~price_usd,
    color = ~genre,
    linetype = ~measure,
    type = "scatter",
    mode = "lines",
    text = ~paste(
      "Year:", year, "<br>",
      "Genre:", genre, "<br>",
      "Measure:", measure, "<br>",
      "Price (USD):", scales::comma(price_usd)
    ),
    hoverinfo = "text"
  ) %>%
  layout(
    title = list(
      text = "Changes in Sales Price by Genre Over Time",
      font = list(size = 16, family = "Arial")
    ),
    xaxis = list(
      title = "Year",
      range = c(1997, 2012),
      tickvals = seq(1997, 2012, 2),
      showgrid = TRUE,
      zeroline = FALSE
    ),
    yaxis = list(
      title = "Price in Thousands USD",
      tickvals = seq(0, 350000, 50000),
      ticktext = paste0(seq(0, 350, 50), "k"),
      showgrid = TRUE,
      zeroline = TRUE
    ),
    legend = list(
      title = list(text = "Genre and Measure"),
      orientation = "v",  # Vertical orientation to avoid overlap
      x = 1.1, y = 1,     # Position the legend outside the plot area
      xanchor = "left",
      yanchor = "top"
    ),
    margin = list(
      l = 80, r = 120, b = 80, t = 80
    )  # Add margins for cleaner spacing
  )

# Display the revised Plotly plot
fig2

From the interactive visualization:

  • Sculpture shows a consistent upward trend throughout the period.
  • Painting exhibited a stable increase until 2007, followed by a sharp spike and a subsequent drastic decline.
  • Photography and Print exhibit significant fluctuations over time.
  • Other genres maintain a relatively stable trend with no notable changes from 2007.

Problem 3.

Setup

library(data.table)
library(nycflights13)

flights <- data.table(flights)
planes <- data.table(planes)

a.

# Delay for Departure
departure <- merge(flights[, faa := origin],
                   airports,
                   by = "faa",
                   all.x = TRUE)
departure[, .(N = .N,
              avg_delay = mean(dep_delay, na.rm = TRUE),
              med_delay = median(dep_delay, na.rm = TRUE)),
          by = name] |>
  _[N >= 10, !"N"] |>
  _[order(avg_delay, decreasing = TRUE)]
                  name avg_delay med_delay
                <char>     <num>     <num>
1: Newark Liberty Intl  15.10795        -1
2: John F Kennedy Intl  12.11216        -1
3:          La Guardia  10.34688        -3
# Delay for Arrival
arrival <- merge(flights[, faa := dest],
                 airports,
                 by = "faa",
                 all.x = TRUE)
arrival[, .(name = ifelse(is.na(first(name)), first(faa), first(name)),
            N = .N,
            avg_delay = mean(arr_delay, na.rm = TRUE),
            med_delay = median(arr_delay, na.rm = TRUE)),
        by = faa] |>
  _[N >= 10, !c("faa", "N")] |>
  _[order(avg_delay, decreasing = TRUE)] |>
  print(x = _, nrows = 10000)
                                     name    avg_delay med_delay
                                   <char>        <num>     <num>
  1:                Columbia Metropolitan  41.76415094      28.0
  2:                           Tulsa Intl  33.65986395      14.0
  3:                    Will Rogers World  30.61904762      16.0
  4:                 Jackson Hole Airport  28.09523810      15.0
  5:                        Mc Ghee Tyson  24.06920415       2.0
  6:               Dane Co Rgnl Truax Fld  20.19604317       1.0
  7:                        Richmond Intl  20.11125320       1.0
  8:        Akron Canton Regional Airport  19.69833729       3.0
  9:                      Des Moines Intl  19.00573614       0.0
 10:                   Gerald R Ford Intl  18.18956044       1.0
 11:                      Birmingham Intl  16.87732342      -2.0
 12:         Theodore Francis Green State  16.23463687       1.0
 13: Greenville-Spartanburg International  15.93544304      -0.5
 14:    Cincinnati Northern Kentucky Intl  15.36456376      -3.0
 15:            Savannah Hilton Head Intl  15.12950601      -1.0
 16:          Manchester Regional Airport  14.78755365      -3.0
 17:                          Eppley Afld  14.69889841      -2.0
 18:                               Yeager  14.67164179      -1.5
 19:                     Kansas City Intl  14.51405836       0.0
 20:                          Albany Intl  14.39712919      -4.0
 21:                General Mitchell Intl  14.16722038       0.0
 22:                       Piedmont Triad  14.11260054      -2.0
 23:               Washington Dulles Intl  13.86420212      -3.0
 24:               Cherry Capital Airport  12.96842105     -10.0
 25:              James M Cox Dayton Intl  12.68048606      -3.0
 26:     Louisville International Airport  12.66938406      -2.0
 27:                  Chicago Midway Intl  12.36422360      -1.0
 28:                      Sacramento Intl  12.10992908       4.0
 29:                    Jacksonville Intl  11.84483416      -2.0
 30:                       Nashville Intl  11.81245891      -2.0
 31:                Portland Intl Jetport  11.66040210      -4.0
 32:               Greater Rochester Intl  11.56064461      -5.0
 33:      Hartsfield Jackson Atlanta Intl  11.30011285      -1.0
 34:                Lambert St Louis Intl  11.07846451      -3.0
 35:                         Norfolk Intl  10.94909344      -4.0
 36:            Baltimore Washington Intl  10.72673385      -5.0
 37:                         Memphis Intl  10.64531435      -2.5
 38:                   Port Columbus Intl  10.60132291      -3.0
 39:                  Charleston Afb Intl  10.59296847      -4.0
 40:                    Philadelphia Intl  10.12719014      -3.0
 41:                  Raleigh Durham Intl  10.05238095      -3.0
 42:                    Indianapolis Intl   9.94043412      -3.0
 43:            Charlottesville-Albemarle   9.50000000      -5.0
 44:               Cleveland Hopkins Intl   9.18161129      -5.0
 45:        Ronald Reagan Washington Natl   9.06695204      -2.0
 46:                      Burlington Intl   8.95099602      -4.0
 47:                 Buffalo Niagara Intl   8.94595186      -5.0
 48:                Syracuse Hancock Intl   8.90392501      -5.0
 49:                          Denver Intl   8.60650021      -2.0
 50:                      Palm Beach Intl   8.56297210      -3.0
 51:                                  BQN   8.24549550      -1.0
 52:                             Bob Hope   8.17567568      -3.0
 53:       Fort Lauderdale Hollywood Intl   8.08212154      -3.0
 54:                          Bangor Intl   8.02793296      -9.0
 55:           Asheville Regional Airport   8.00383142      -1.0
 56:                                  PSE   7.87150838       0.0
 57:                      Pittsburgh Intl   7.68099053      -5.0
 58:                       Gallatin Field   7.60000000      -2.0
 59:                 NW Arkansas Regional   7.46572581      -2.0
 60:                           Tampa Intl   7.40852503      -4.0
 61:               Charlotte Douglas Intl   7.36031885      -3.0
 62:             Minneapolis St Paul Intl   7.27016886      -5.0
 63:                      William P Hobby   7.17618819      -4.0
 64:                         Bradley Intl   7.04854369     -10.0
 65:                     San Antonio Intl   6.94537178      -9.0
 66:                      South Bend Rgnl   6.50000000      -3.5
 67:     Louis Armstrong New Orleans Intl   6.49017497      -6.0
 68:                        Key West Intl   6.35294118       7.0
 69:                        Eagle Co Rgnl   6.30434783      -4.0
 70:                Austin Bergstrom Intl   6.01990875      -5.0
 71:                   Chicago Ohare Intl   5.87661475      -8.0
 72:                         Orlando Intl   5.45464309      -5.0
 73:               Detroit Metro Wayne Co   5.42996346      -7.0
 74:                        Portland Intl   5.14157973      -5.0
 75:                        Nantucket Mem   4.85227273      -3.0
 76:                      Wilmington Intl   4.63551402      -7.0
 77:                    Myrtle Beach Intl   4.60344828     -13.0
 78:    Albuquerque International Sunport   4.38188976      -5.5
 79:         George Bush Intercontinental   4.24079040      -5.0
 80:        Norman Y Mineta San Jose Intl   3.44817073      -7.0
 81:               Southwest Florida Intl   3.23814963      -5.0
 82:                       San Diego Intl   3.13916574      -5.0
 83:              Sarasota Bradenton Intl   3.08243131      -5.0
 84:            Metropolitan Oakland Intl   3.07766990      -9.0
 85:   General Edward Lawrence Logan Intl   2.91439222      -9.0
 86:                   San Francisco Intl   2.67289152      -8.0
 87:                                  SJU   2.52052659      -6.0
 88:                         Yampa Valley   2.14285714       2.0
 89:              Phoenix Sky Harbor Intl   2.09704733      -6.0
 90:            Montrose Regional Airport   1.78571429     -10.5
 91:                     Los Angeles Intl   0.54711094      -7.0
 92:               Dallas Fort Worth Intl   0.32212685      -9.0
 93:                           Miami Intl   0.29905978      -9.0
 94:                       Mc Carran Intl   0.25772849      -8.0
 95:                  Salt Lake City Intl   0.17625459      -8.0
 96:                           Long Beach  -0.06202723     -10.0
 97:                Martha\\\\'s Vineyard  -0.28571429     -11.0
 98:                  Seattle Tacoma Intl  -1.09909910     -11.0
 99:                        Honolulu Intl  -1.36519258      -7.0
100:                                  STT  -3.83590734      -9.0
101:            John Wayne Arpt Orange Co  -7.86822660     -11.0
102:                    Palm Springs Intl -12.72222222     -13.5
                                     name    avg_delay med_delay

b.

merged <- merge(flights,
                planes,
                by = "tailnum",
                all.x = TRUE)

avg_speed <- merged[, `:=`(flights_num = .N,
                           avg_mph = mean(distance/(air_time/60), na.rm = TRUE)),
                    by = model]
avg_speed[avg_speed[, .I[which.max(avg_mph)]],.(model, avg_mph, flights_num)]
     model  avg_mph flights_num
    <char>    <num>       <int>
1: 777-222 482.6254           4